# -*- coding: utf-8 -*-
"""
Created on Thu Jun 16 17:10:45 2022
Preprocess environmental complaints for LDA in the Water Subset
@author: samir
"""

import pandas as pd
import topicmodels
import topicmodels.preprocess
import os
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
from paths import target_path #importing target_path from paths.py


def main() -> None:
    """Announce on stdout that the water LDA preprocessing script is running.

    Only the banner is printed here; the preprocessing steps themselves are
    module-level statements further down the file.
    """
    banner = "Running: Water LDA Preprocess Lemmatize"
    print(banner)


# Print the banner only when the file is executed as a script.
if __name__ == "__main__":
    main()

# Change to relevant directory
# The data paths used later in this script are relative, so they resolve
# against target_path once the working directory has been switched.
if not os.path.exists(target_path):
    # Only warn; the script carries on from the current working directory.
    print(f"Warning: {target_path} does not exist!")
else:
    os.chdir(target_path)
    
###Import Data
data = pd.read_table(
    "Water_Subset/Raw_Data/water_complaints_with_year.txt",
    encoding="utf-8",
)
docsobj = topicmodels.RawDocs(data.IncidentDescription, "long")


def _report_token_counts(token_lists):
    """Print the unique and total token counts and return the flat token list.

    token_lists is an iterable of per-document token sequences; the tokens
    of all documents are flattened into a single list before counting.
    """
    flat_tokens = [token for doc_tokens in token_lists for token in doc_tokens]
    print("number of unique tokens = %d" % len(set(flat_tokens)))
    print("number of total tokens = %d" % len(flat_tokens))
    return flat_tokens


##Get initial wordcount
all_tokens = _report_token_counts(docsobj.tokens)

#Remove tokens of length one
docsobj.token_clean(1)

#Wordcount after token clean
all_tokens = _report_token_counts(docsobj.tokens)

#Remove stopwords
docsobj.stopword_remove("tokens")

#Wordcount after stopword removal
all_tokens = _report_token_counts(docsobj.tokens)

###Extract tokens from the docsobj
# One row per document; each cell holds that document's token sequence.
dataframe = pd.DataFrame()
dataframe['IncidentDescription'] = docsobj.tokens

###Define function to map an NLTK PoS tag to the WordNet PoS constant the lemmatizer can read
def pos_tagger(nltk_tag):
    """Translate an NLTK part-of-speech tag into a WordNet PoS constant.

    The start of the tag selects the constant: 'J' -> wordnet.ADJ,
    'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.
    Any other tag falls back to wordnet.NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    # Noun if part of speech is not one of the basic ones
    return wordnet.NOUN

###Apply PoS tag
# pos_tag is applied to each row's token list and its result is stored as-is;
# the lemmatization step below unpacks each stored item as a (word, tag) pair.
dataframe['IncidentDescriptionTagged'] = dataframe['IncidentDescription'].apply(pos_tag)

###Lemmatize the dataset
wnl = WordNetLemmatizer()
# Build each document's lemma string in a single pass instead of re-reading
# and re-writing the DataFrame cell once per word. Every lemma, including the
# last one, is followed by one space; a document without tokens yields "".
dataframe['IncidentDescriptionLemma'] = [
    "".join(
        wnl.lemmatize(word, pos=pos_tagger(tag)) + " "
        for word, tag in tagged_tokens
    )
    for tagged_tokens in dataframe['IncidentDescriptionTagged']
]

###Add in identifying information
# Every column of the raw data except the free-text description is kept and
# placed next to the token, tagged and lemma columns (joined on the row index).
Info_frame = data.drop('IncidentDescription', axis=1)
Merged_frame = pd.concat([Info_frame, dataframe], axis=1)

###Save as csv
Merged_frame.to_csv(r'Water_Subset/Processed_Data/water_Complaints_lemmatized.csv', index=False)
















